#importing required libraries
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.metrics import confusion_matrix, recall_score, classification_report, accuracy_score, precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
import graphviz
from IPython.display import display
from sklearn import tree
#read the csv data into pandas dataframe
# NOTE(review): the UCI "bank-full.csv" ships semicolon-delimited — confirm this
# local copy is comma-delimited, otherwise pd.read_csv needs sep=';'.
bank_data = pd.read_csv('bank-full.csv')
# Preview the first five rows.
bank_data.head()
#get the info of the data which gives the data types
bank_data.info()
#find the shape of the data
bank_data.shape
#Look for any null values exists in the data
bank_data.isnull().sum()
There are no null values in the data.
#look for any duplicate records exists in the data
# count() on the duplicated-rows slice reports per-column non-null counts;
# all zeros means there are no fully duplicated rows.
bank_data[bank_data.duplicated()].count()
There are no duplicate records in the data.
# The numerical value of the day column has no specific weightage over other
# values. For example, day 1 of the month and day 30 of the month are equal.
# If we provide the data as-is to the model, it might treat day 30 as if it
# had more weight, so we bucket the day column into 4 weeks of the month and
# drop the day column.
conditions = [bank_data['day'] <= 7,
              (bank_data['day'] > 7) & (bank_data['day'] <= 14),
              (bank_data['day'] > 14) & (bank_data['day'] <= 21),
              bank_data['day'] > 21]
labels = ['firstweek', 'secondweek', 'thirdweek', 'forthweek']
# The four conditions are exhaustive over valid day numbers, so the default
# is never actually selected.
bank_data['week of the month'] = np.select(conditions, labels, default=np.nan)
# BUG FIX: astype() returns a new Series and does not modify in place — the
# original call discarded the result, leaving the cast a no-op. Assign it back.
bank_data['week of the month'] = bank_data['week of the month'].astype('object')
#Since we mapped day column to week of the month, dropping day column
bank_data.drop('day', axis=1, inplace=True)
bank_data.head()
#Checking pdays column with value 999. As 999 means the customer was never contacted before
# NOTE(review): in the original UCI bank-marketing data "never contacted" is
# encoded as pdays = -1 (999 is used in the bank-additional variant) — confirm
# which encoding this CSV actually uses.
bank_data[bank_data['pdays']==999]
#Since pdays means - number of days that passed by after the client was last contacted from a previous campaign,
# so it cannot be negative. Lets treat the pdays column by taking absolute value of it
# NOTE(review): abs() maps a -1 "never contacted" sentinel to 1, making it
# indistinguishable from "contacted one day ago" — verify this is intended.
bank_data['pdays'] = bank_data['pdays'].abs()
# Sanity check: no negative pdays values remain.
bank_data[bank_data['pdays']<0]
#Checking to see if we have negative balance
bank_data[bank_data['balance'] < 0]
#get the description with 5 point summary and other statistics of the data
bank_data_description = bank_data.describe().transpose()
bank_data_description['range'] = bank_data_description['max'] - bank_data_description['min']
# FIX: numeric_only=True — median/quantile/skew raise a TypeError on the
# object (categorical) columns under pandas >= 2.0; older pandas silently
# dropped them, so this preserves the original intent.
bank_data_description['median'] = bank_data.median(numeric_only=True)
bank_data_description['IQR'] = bank_data.quantile(0.75, numeric_only=True) - bank_data.quantile(0.25, numeric_only=True)
bank_data_description['Skewness'] = bank_data.skew(numeric_only=True)
bank_data_description
#function to get the list of categorical and numerical columns
def get_categorical_and_numerical_columns(data=None):
    """Return (numerical_columns, categorical_columns) for *data*.

    data: DataFrame to inspect; defaults to the module-level ``bank_data``,
    so existing no-argument callers keep working.
    """
    if data is None:
        data = bank_data
    cols = data.columns
    # select_dtypes is the public API (the original used the private
    # DataFrame._get_numeric_data helper, which may disappear).
    numerical_columns = data.select_dtypes(include=np.number).columns
    categorical_columns = list(set(cols) - set(numerical_columns))
    return numerical_columns, categorical_columns
numerical_columns, categorical_columns = get_categorical_and_numerical_columns()
# Finding the value counts of categorical columns
for col in categorical_columns:
    print(bank_data[col].value_counts())
# Function to plot distribution plots for numerical columns and count plots
# for categorical columns
def plotgraphs(data):
    """Show one figure per column of *data*: a KDE for numerical columns,
    a count plot for categorical columns.

    Relies on the module-level ``numerical_columns`` list.
    """
    plt.rcParams['figure.figsize'] = (15, 7)
    for col in data.columns:
        if col in numerical_columns:
            sns.displot(data[col], kind='kde')
        else:
            # FIX: pass the series as x= — in seaborn >= 0.12 the first
            # positional argument binds to `data` (wide-form) instead of x,
            # changing/breaking the plot.
            sns.countplot(x=data[col])
        plt.show()
plotgraphs(bank_data)
#normalizing numerical data using standard scaler
standardScaler = StandardScaler()
# Each numerical column is standardised independently (zero mean, unit
# variance) — fit_transform refits the scaler per column.
# NOTE(review): scaling is fitted on the full dataset before the train/test
# split below, which leaks test-set statistics into training — confirm this
# is acceptable for the analysis.
for col in numerical_columns:
    bank_data[col] = standardScaler.fit_transform(bank_data[[col]])
bank_data.head()
#Identifying outliers
IQR = bank_data_description['IQR']
# FIX: numeric_only=True — quantile raises a TypeError on object columns
# under pandas >= 2.0; older pandas dropped them silently, so the intent is
# preserved.
Q1 = bank_data.quantile(0.25, numeric_only=True)
Q3 = bank_data.quantile(0.75, numeric_only=True)
# Rows with at least one cell outside the Tukey fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. Note np.where(...)[0] yields one entry per
# outlying CELL, so a row with several outlying cells appears multiple times.
outlier_row_index = np.where((bank_data < Q1 - 1.5 * IQR) | (bank_data > Q3 + 1.5 * IQR))[0]
bank_data_outliers = bank_data.iloc[outlier_row_index, :]
bank_data_outliers
#Function to get total outliers per feature
def get_outliers(data, q1=None, q3=None, iqr=None):
    """Return the per-column count of Tukey-fence outliers in *data*.

    q1/q3/iqr default to the module-level Q1/Q3/IQR (computed from the full
    bank_data), which matches the original behaviour; pass them explicitly to
    use statistics that actually belong to *data* (e.g. the training split,
    which the original implicitly compared against full-dataset fences).
    """
    if q1 is None:
        q1 = Q1
    if q3 is None:
        q3 = Q3
    if iqr is None:
        iqr = IQR
    outliers = ((data < (q1 - 1.5 * iqr)) | (data > (q3 + 1.5 * iqr))).sum()
    return outliers
outliers = get_outliers(bank_data)
outliers
#Finding the percentage of outliers
percentage_of_outliers = (outliers/len(bank_data)) * 100
percentage_of_outliers
# Columns campaign, pdays and previous have outliers. lets plot the graphs to see the outliers visually.
columns_with_outliers = ['campaign', 'pdays', 'previous']
#Function to plot outliers
def plot_outliers():
    """Draw one boxen plot per outlier-prone column, stacked vertically
    in a single 3x1 figure."""
    plt.rcParams['figure.figsize'] = (10, 15)
    palette_names = ['spring', 'rainbow', 'pastel']
    panels = zip(columns_with_outliers, palette_names)
    for position, (column, palette_name) in enumerate(panels, start=1):
        plt.subplot(3, 1, position)
        sns.boxenplot(bank_data[column], palette=palette_name)
    plt.show()
plot_outliers()
The campaign column has about 1.8% outliers, and the pdays and previous columns also contain outliers. We have significant outliers, and we need to treat them because they might cause the model to overfit, resulting in high variance on test data. Models like Decision Tree and AdaBoost are very sensitive to outliers. Because it is good practice to fix outliers only in the training data instead of the whole data set, we will treat them once we split the data into train and test.
plt.rcParams['figure.figsize'] = (15, 7)
#Function to plot boxplots for continuous variable vs target and count plots for categorical variables vs Target
def plot_graphs_feature_vs_target(data):
    """For every feature of *data* except 'Target', show a box plot against
    the Target for numerical columns or a Target-hued count plot for
    categorical columns — one figure per feature.
    """
    for col in data.columns:
        if col == 'Target':
            continue
        if col in numerical_columns:
            # FIX: keyword arguments — seaborn >= 0.12 made x/y keyword-only,
            # so the original positional call raises a TypeError there.
            sns.boxplot(x=data['Target'], y=data[col])
        else:
            sns.countplot(x=data[col], hue=data['Target'])
        plt.show()
plot_graphs_feature_vs_target(bank_data)
Customers with management jobs subscribed to the term deposit more than other job types.
sns.barplot(data=bank_data, x='education', y='age', hue='Target')
plt.show()
People with primary education and higher age are more likely to subscribe to a term deposit.
# Mean account balance by job type, split by subscription outcome.
sns.barplot(data=bank_data, x='job', y='balance', hue='Target')
plt.show()
People who are retired or self-employed with a good account balance subscribed to the term deposit.
# Mean of each numerical feature per Target class.
# FIX: numeric_only=True — groupby(...).mean() raises a TypeError on object
# columns under pandas >= 2.0; older pandas dropped them silently.
bank_data.groupby('Target').mean(numeric_only=True)
# Contingency tables of housing / loan vs the Target, with totals.
pd.crosstab(bank_data['housing'], bank_data['Target'], margins=True)
pd.crosstab(bank_data['loan'], bank_data['Target'], margins=True)
#Map target value from 'yes' and 'no' to 1 and 0 respectively
mapping = {'no':0, 'yes':1}
bank_data['Target'] = bank_data['Target'].replace(mapping)
# BUG FIX: astype() returns a new Series and does not modify in place — the
# original call discarded the result, so the cast was a no-op. Assign it back.
bank_data['Target'] = bank_data['Target'].astype('int64')
bank_data['Target']
#plot Pairplot
sns.pairplot(bank_data, hue='Target')
plt.show()
# Plot heatmaps
plt.figure(figsize=(10,7))
# Pairwise correlations between numerical features, annotated.
# FIX: numeric_only=True — corr() raises on object columns in pandas >= 2.0;
# older pandas dropped them silently.
sns.heatmap(bank_data.corr(numeric_only=True), annot=True)
There is not much correlation between the features and the target variable, nor much correlation among the features themselves.
# Re-derive the column lists now that Target is numeric (it moves from the
# categorical set to the numerical set).
numerical_columns, categorical_columns = get_categorical_and_numerical_columns()
categorical_columns
I am going to create dummy variables for all the categorical variables instead of using label encoding, because label encoding converts the categorical values into numerical values. For example, the values of the contact column could be label encoded as 1 for cellular, 2 for telephone and 3 for unknown, which would cause the model to misinterpret that an order exists among the values — but they are just categorical values with no relation between them.
# One-hot encode every categorical column (one indicator column per level).
bank_data = pd.get_dummies(bank_data, columns=categorical_columns)
bank_data.head()
# Splitting the data into train and test with 70:30 ratio using a random_state = 7
X = bank_data.drop('Target', axis=1)
y = bank_data['Target']
# NOTE(review): the split is not stratified on y; the class-balance check
# below relies on the random split happening to preserve the distribution.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=7)
#Function to return class level data distribution percentage
def find_percent_of_class_dist(data, col):
    """Return (percent of 0s, percent of 1s) in *data*.

    *data* may be a DataFrame (distribution is taken over data[col]) or a
    Series (*col* is ignored).
    """
    values = data[col] if isinstance(data, pd.DataFrame) else data
    total = len(data)
    no_percentage = (len(values[values == 0]) / total) * 100
    yes_percentage = (len(values[values == 1]) / total) * 100
    return no_percentage, yes_percentage
# Compare the class distribution of the full set against the train and test
# splits to confirm the random split did not skew the classes.
no_percentage, yes_percentage = find_percent_of_class_dist(bank_data, 'Target')
print('the percentage of no: {} and yes: {} values in the whole data set'.format(no_percentage, yes_percentage))
no_percentage, yes_percentage = find_percent_of_class_dist(y_train, None)
print('the percentage of no: {} and yes: {} values in the train data set'.format(no_percentage, yes_percentage ))
no_percentage, yes_percentage = find_percent_of_class_dist(y_test, None)
print('the percentage of no: {} and yes: {} values in the test data set'.format(no_percentage, yes_percentage ))
We can say the class distribution is roughly the same across the whole data set and the train and test splits.
#get the outliers in training data
outliers_in_train_data = get_outliers(X_train)
outliers_in_train_data
# Now that we split the data into train and test, cap the outliers in the
# training data (winsorize to the Tukey whiskers) before fitting the model.
for column in columns_with_outliers:
    first_quartile = X_train[column].quantile(0.25)
    third_quartile = X_train[column].quantile(0.75)
    whisker_span = 1.5 * (third_quartile - first_quartile)
    # Clamp values below the lower whisker and above the upper whisker
    # in a single pass.
    X_train[column] = X_train[column].clip(lower=first_quartile - whisker_span,
                                           upper=third_quartile + whisker_span)
outliers_in_train_data_after_capping = get_outliers(X_train)
outliers_in_train_data_after_capping
# Baseline decision tree grown without depth limits (expected to overfit;
# pruned below).
dTree = DecisionTreeClassifier(criterion='gini', random_state=7)
dTree.fit(X_train, y_train)
print('Train score of Decision tree: {}'. format(dTree.score(X_train, y_train)))
print('Test score of Decision tree: {}'. format(dTree.score(X_test, y_test)))
#Function to visualize the tree
def plot_tree_graph(dTree):
    """Export *dTree* to Graphviz dot ('bank_data_tree.dot') and render it
    inline. Uses the module-level X_train for feature names.
    """
    # FIX: context managers close the file even if export_graphviz raises
    # (the original opened/closed the handle manually, leaking on error).
    with open('bank_data_tree.dot', 'w') as dot_file:
        tree.export_graphviz(dTree, out_file=dot_file, feature_names=X_train.columns)
    with open('bank_data_tree.dot') as f:
        dot_graph = f.read()
    display(graphviz.Source(dot_graph))
plot_tree_graph(dTree)
# The above tree is overfit with training data, so lets prune the tree to avoid overfit
# Pruning via max_depth and min_samples_leaf caps the tree's complexity.
dTree_pruned = DecisionTreeClassifier(criterion='gini', max_depth=4, min_samples_leaf=50, random_state=7)
dTree_pruned.fit(X_train, y_train)
print('Train score of pruned Decision tree: {}'. format(dTree_pruned.score(X_train, y_train)))
print('Test score of pruned Decision tree: {}'. format(dTree_pruned.score(X_test, y_test)))
#plot the pruned tree
plot_tree_graph(dTree_pruned)
#predict using test data
y_predict = dTree_pruned.predict(X_test)
#Function to plot confusion matrix
def plot_confusion_matrix(y_test, y_predict):
    """Render the confusion matrix of y_test vs y_predict as an annotated
    heatmap with No/Yes axis labels."""
    class_labels = ["No", "Yes"]
    matrix_frame = pd.DataFrame(confusion_matrix(y_test, y_predict),
                                index=class_labels,
                                columns=class_labels)
    plt.figure(figsize=(10, 7))
    sns.heatmap(matrix_frame, annot=True, fmt='g')
    plt.show()
plot_confusion_matrix(y_test, y_predict)
#Create a result dataframe with model as index and performance metrics as columns
result_df = pd.DataFrame(columns=['train_score', 'test_score', 'precision', 'recall'])
#Function to update the result Dataframe
def update_result_df(train_score, y_test, y_predict, method_name):
    """Set the *method_name* row of the module-level result_df to the given
    train accuracy plus test accuracy/precision/recall, and return result_df.

    Note: mutates the global result_df in place via .loc (an existing row
    with the same name is overwritten).
    """
    result_df.loc[method_name] = [train_score, accuracy_score(y_test, y_predict), precision_score(y_test, y_predict), recall_score(y_test, y_predict)]
    return result_df
result_df = update_result_df(dTree_pruned.score(X_train, y_train), y_test, y_predict, 'Decision Tree')
result_df.head()
print('Decision Tree class level metrics:')
print(classification_report(y_test, y_predict))
# Bagging ensemble of 50 unpruned decision trees.
# NOTE(review): the base_estimator keyword was renamed to estimator in
# scikit-learn 1.2 and removed in 1.4, so this call fails on current
# sklearn — confirm the pinned version before upgrading.
bagging_classifier = BaggingClassifier(base_estimator=dTree, n_estimators=50, random_state=7)
bagging_classifier.fit(X_train, y_train)
print('Train score of Bagging Classifier: {}'. format(bagging_classifier.score(X_train, y_train)))
print('Test score of Bagging Classifier: {}'. format(bagging_classifier.score(X_test, y_test)))
#predict using test data
y_predict = bagging_classifier.predict(X_test)
#plot confusion matrix
plot_confusion_matrix(y_test, y_predict)
#update result dataframe
result_df = update_result_df(bagging_classifier.score(X_train, y_train), y_test, y_predict, 'Bagging Classifier')
result_df.head()
print('Bagging classifier class level metrics:')
print(classification_report(y_test, y_predict))
# AdaBoost with 100 boosting rounds (default depth-1 tree base learner).
adaboost_classifier = AdaBoostClassifier(n_estimators=100, random_state=7)
adaboost_classifier.fit(X_train, y_train)
print('Train score of AdaBoost Classifier: {}'. format(adaboost_classifier.score(X_train, y_train)))
print('Test score of AdaBoost Classifier: {}'. format(adaboost_classifier.score(X_test, y_test)))
#predict using test data
y_predict = adaboost_classifier.predict(X_test)
#Plot confusion matrix
plot_confusion_matrix(y_test, y_predict)
#update result dataframe
result_df = update_result_df(adaboost_classifier.score(X_train, y_train), y_test, y_predict, 'AdaBoost Classifier')
result_df.head()
print('AdaBoost Classifier class level metrics:')
print(classification_report(y_test, y_predict))
# Gradient boosting: 120 rounds with a relatively aggressive learning rate.
gradientboost_classifier = GradientBoostingClassifier(n_estimators=120, learning_rate=0.3, random_state=7)
gradientboost_classifier.fit(X_train, y_train)
print('Train score of Gradientboost Classifier: {}'. format(gradientboost_classifier.score(X_train, y_train)))
print('Test score of Gradientboost Classifier: {}'. format(gradientboost_classifier.score(X_test, y_test)))
#predict using test data
y_predict = gradientboost_classifier.predict(X_test)
#plot confusion matrix
plot_confusion_matrix(y_test, y_predict)
#update result dataframe
result_df = update_result_df(gradientboost_classifier.score(X_train, y_train), y_test, y_predict, 'GradientBoost Classifier')
result_df.head()
print('GradientBoost Classifier class level metrics:')
print(classification_report(y_test, y_predict))
# Random forest of 100 Gini trees.
randomforest_classifier = RandomForestClassifier(criterion='gini', n_estimators=100, random_state=7)
randomforest_classifier.fit(X_train, y_train)
print('Train score of RandomForest Classifier: {}'. format(randomforest_classifier.score(X_train, y_train)))
print('Test score of RandomForest Classifier: {}'. format(randomforest_classifier.score(X_test, y_test)))
#predict using test data
y_predict = randomforest_classifier.predict(X_test)
#plot confusion matrix
plot_confusion_matrix(y_test, y_predict)
#update result dataframe
result_df = update_result_df(randomforest_classifier.score(X_train, y_train), y_test, y_predict, 'RandomForest Classifier')
result_df.head()
print('RandomForest Classifier class level metrics:')
print(classification_report(y_test, y_predict))
The objective of the project is to help the marketing team identify potential customers who are relatively more likely to subscribe to the term deposit, and thus increase the hit ratio.
For our objective, the model predicting that a customer will subscribe to a term deposit when the customer ends up not subscribing is less costly than the model predicting that a customer will not subscribe when the customer ends up subscribing to the term deposit.
Having more False Negatives will be costly for the bank, so we need to reduce False Negatives. Since Recall deals with False Negatives, we need to look at the Recall metric while evaluating model performance.
Recall gives the fraction of actual positive cases that the model correctly identifies.
It measures how well the model identifies the relevant class — in our case, term deposit subscription.
#finding the importance of features
# Impurity-based feature importances from the gradient-boosting model,
# sorted so the most influential features come first.
pd.DataFrame(gradientboost_classifier.feature_importances_, index=X_train.columns, columns=['importance']).sort_values(by='importance', ascending=False)
To increase the number of customers who take a term deposit:
Customers who subscribed during the previous campaign have a good chance of subscribing to a term deposit again.
The contact method (cellular or telephone) does not seem to have an impact on term deposit subscription.
Duration, a previous outcome other than failure, the age of the customer, contacting customers in the months of March, April, June and October, and having a housing loan seem to have a good impact on the target variable. Hence, the bank should focus more on these features while targeting customers for term deposit subscription.
Other features like marital status, education, job, default, loan, previous and pdays do not seem to have much impact on the target variable, so the bank should focus less on these features.
Banks allow overdrafts — I see negative values for account balance.
Banks face the challenge of collecting proper data — I see a lot of unknowns in the data.
Banks run different campaigns from time to time and try to personalize those campaigns to their customers' needs.